{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from collections import Counter\n",
    "from bs4 import BeautifulSoup\n",
    "from nltk.corpus import stopwords\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer as TF\n",
    "from sklearn.naive_bayes import MultinomialNB as MNB\n",
    "from sklearn.linear_model import LogisticRegression as LR\n",
    "from sklearn.model_selection import GridSearchCV, train_test_split\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from conf import config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "_STOP_WORDS_CACHE = {}\n",
    "\n",
    "\n",
    "def _english_stop_words():\n",
    "    \"\"\"Return the NLTK English stop words as a frozenset, built only on first use.\"\"\"\n",
    "    if \"english\" not in _STOP_WORDS_CACHE:\n",
    "        _STOP_WORDS_CACHE[\"english\"] = frozenset(stopwords.words(\"english\"))\n",
    "    return _STOP_WORDS_CACHE[\"english\"]\n",
    "\n",
    "\n",
    "def clean_text(origin_text):\n",
    "    \"\"\"Normalise raw text into a space-separated string of lowercase content words.\n",
    "\n",
    "    Steps: strip HTML markup, replace every character outside a-z/A-Z with a\n",
    "    space, lowercase, split on whitespace, drop NLTK English stop words and\n",
    "    re-join the remaining tokens with single spaces.\n",
    "\n",
    "    Args:\n",
    "        origin_text: Raw text, possibly containing HTML markup.\n",
    "\n",
    "    Returns:\n",
    "        The cleaned text as one str ('' when no token survives).\n",
    "    \"\"\"\n",
    "    # Strip HTML tags. The parser is named explicitly so the result does not\n",
    "    # depend on which optional parsers (lxml, html5lib) happen to be installed,\n",
    "    # and so bs4 does not warn about a guessed parser on every call.\n",
    "    text = BeautifulSoup(origin_text, \"html.parser\").get_text()\n",
    "    # Replace punctuation, digits and any other non-letter character with a space\n",
    "    text = re.sub(\"[^a-zA-Z]\", \" \", text)\n",
    "    # Lowercase everything and tokenise on whitespace\n",
    "    words = text.lower().split()\n",
    "    # Drop stop words; the set is cached instead of being rebuilt for every document\n",
    "    stop_words = _english_stop_words()\n",
    "    meaningful_words = [w for w in words if w not in stop_words]\n",
    "    # Join the remaining tokens back into a single string\n",
    "    return \" \".join(meaningful_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>location</th>\n",
       "      <th>department</th>\n",
       "      <th>salary_range</th>\n",
       "      <th>company_profile</th>\n",
       "      <th>description</th>\n",
       "      <th>requirements</th>\n",
       "      <th>benefits</th>\n",
       "      <th>telecommuting</th>\n",
       "      <th>has_company_logo</th>\n",
       "      <th>has_questions</th>\n",
       "      <th>employment_type</th>\n",
       "      <th>required_experience</th>\n",
       "      <th>required_education</th>\n",
       "      <th>industry</th>\n",
       "      <th>function</th>\n",
       "      <th>fraudulent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Office Manager</td>\n",
       "      <td>GR, I, Athens</td>\n",
       "      <td>Administration</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Workable is a venture-backed startup making cl...</td>\n",
       "      <td>We are looking for an experienced office manag...</td>\n",
       "      <td>Excellent use of English, speaking and writing...</td>\n",
       "      <td>Our goal is to create a company where employee...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Front End Engineer</td>\n",
       "      <td>US, CA, Emeryville</td>\n",
       "      <td>Engineering</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Abakus is a software platform built on game th...</td>\n",
       "      <td>As Front End Engineer you will be leading the ...</td>\n",
       "      <td>3-5 years of UI/UX development experience with...</td>\n",
       "      <td>Competitive salaryStock optionsComprehensive b...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Full-time</td>\n",
       "      <td>Mid-Senior level</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Computer Software</td>\n",
       "      <td>Engineering</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Software Engineer</td>\n",
       "      <td>US, MA, Wilmington</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0-130000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Our client is one of the leading SaaS-based pr...</td>\n",
       "      <td>At least 5 years experience developing large-s...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Full-time</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Bachelor's Degree</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Product Manager</td>\n",
       "      <td>US, CA, Manhattan Beach</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>The Boston Consulting Group (#URL_45423e1e1670...</td>\n",
       "      <td>We are looking for a Product Manager to be a p...</td>\n",
       "      <td>BASIC JOB REQUIREMENTS:Bachelors Degree form a...</td>\n",
       "      <td>The Boston Consulting Group (BCG) is a global ...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>Full-time</td>\n",
       "      <td>Associate</td>\n",
       "      <td>Bachelor's Degree</td>\n",
       "      <td>Computer Software</td>\n",
       "      <td>Product Management</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Sales Intern</td>\n",
       "      <td>US, NY, New York</td>\n",
       "      <td>Sales</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Lean Startup Machine has trained over 25,000 a...</td>\n",
       "      <td>Experience in sales preferredInterest in tech ...</td>\n",
       "      <td>Be part of a growing and global team that has ...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>Part-time</td>\n",
       "      <td>Internship</td>\n",
       "      <td>Some College Coursework Completed</td>\n",
       "      <td>Management Consulting</td>\n",
       "      <td>Sales</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                title                 location      department salary_range  \\\n",
       "0      Office Manager            GR, I, Athens  Administration          NaN   \n",
       "1  Front End Engineer       US, CA, Emeryville     Engineering          NaN   \n",
       "2   Software Engineer       US, MA, Wilmington             NaN     0-130000   \n",
       "3     Product Manager  US, CA, Manhattan Beach             NaN          NaN   \n",
       "4        Sales Intern         US, NY, New York           Sales          NaN   \n",
       "\n",
       "                                     company_profile  \\\n",
       "0  Workable is a venture-backed startup making cl...   \n",
       "1  Abakus is a software platform built on game th...   \n",
       "2                                                NaN   \n",
       "3  The Boston Consulting Group (#URL_45423e1e1670...   \n",
       "4                                                NaN   \n",
       "\n",
       "                                         description  \\\n",
       "0  We are looking for an experienced office manag...   \n",
       "1  As Front End Engineer you will be leading the ...   \n",
       "2  Our client is one of the leading SaaS-based pr...   \n",
       "3  We are looking for a Product Manager to be a p...   \n",
       "4  Lean Startup Machine has trained over 25,000 a...   \n",
       "\n",
       "                                        requirements  \\\n",
       "0  Excellent use of English, speaking and writing...   \n",
       "1  3-5 years of UI/UX development experience with...   \n",
       "2  At least 5 years experience developing large-s...   \n",
       "3  BASIC JOB REQUIREMENTS:Bachelors Degree form a...   \n",
       "4  Experience in sales preferredInterest in tech ...   \n",
       "\n",
       "                                            benefits  telecommuting  \\\n",
       "0  Our goal is to create a company where employee...              0   \n",
       "1  Competitive salaryStock optionsComprehensive b...              0   \n",
       "2                                                NaN              0   \n",
       "3  The Boston Consulting Group (BCG) is a global ...              0   \n",
       "4  Be part of a growing and global team that has ...              0   \n",
       "\n",
       "   has_company_logo  has_questions employment_type required_experience  \\\n",
       "0                 1              1             NaN                 NaN   \n",
       "1                 1              1       Full-time    Mid-Senior level   \n",
       "2                 0              0       Full-time                 NaN   \n",
       "3                 1              0       Full-time           Associate   \n",
       "4                 0              1       Part-time          Internship   \n",
       "\n",
       "                  required_education               industry  \\\n",
       "0                                NaN                    NaN   \n",
       "1                                NaN      Computer Software   \n",
       "2                  Bachelor's Degree                    NaN   \n",
       "3                  Bachelor's Degree      Computer Software   \n",
       "4  Some College Coursework Completed  Management Consulting   \n",
       "\n",
       "             function  fraudulent  \n",
       "0                 NaN           0  \n",
       "1         Engineering           0  \n",
       "2                 NaN           0  \n",
       "3  Product Management           0  \n",
       "4               Sales           0  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the training and test CSV files (paths are relative to the notebook's working directory)\n",
    "train_df = pd.read_csv('../data/train.csv')\n",
    "test_df = pd.read_csv('../data/test.csv')\n",
    "# Preview the first rows of the training frame\n",
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def test(row):\n",
    "    \"\"\"Merge the free-text fields of one job posting into a single string.\n",
    "\n",
    "    Args:\n",
    "        row: Mapping-like record (e.g. a DataFrame row) holding the keys\n",
    "            'company_profile', 'description', 'requirements' and 'benefits'.\n",
    "\n",
    "    Returns:\n",
    "        The non-null fields, in that order, separated by single spaces;\n",
    "        '' when all four fields are null.\n",
    "    \"\"\"\n",
    "    columns = ['company_profile', 'description', 'requirements', 'benefits']\n",
    "    # Join with a space: plain concatenation fused the last word of one field\n",
    "    # with the first word of the next, producing tokens that occur in no field.\n",
    "    return ' '.join(row[column] for column in columns if not pd.isnull(row[column]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Combine the descriptive fields of every posting into one 'text' column (row-wise apply)\n",
    "train_df['text'] = train_df.apply(test, axis=1)\n",
    "test_df['text'] = test_df.apply(test, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run clean_text over every entry of the 'text' column, for both frames\n",
    "train_df['text'] = train_df['text'].map(clean_text)\n",
    "test_df['text'] = test_df['text'].map(clean_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    long term well established optometric practice...\n",
       "1    full service marketing staffing firm serving c...\n",
       "2    globally connected world forced businesses ret...\n",
       "3    indicative changing way internet business make...\n",
       "4    eroad established modernise new zealand paper ...\n",
       "Name: text, dtype: object"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot-check the 'text' column of the first few test rows\n",
    "test_df['text'].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Creating the tfidf vector...\n",
      "\n",
      "(17680, 5000)\n",
      "(200, 5000)\n"
     ]
    }
   ],
   "source": [
    "# Shuffle the training rows. A fixed seed keeps the row order, and therefore\n",
    "# everything derived from it in later cells, identical across Restart & Run All.\n",
    "SHUFFLE_SEED = 0\n",
    "train_df = train_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)\n",
    "\n",
    "# Word-level TF-IDF limited to the 5000 highest-frequency terms; no tokenizer,\n",
    "# preprocessor or stop-word list is configured on the vectorizer itself.\n",
    "tfidf = TF(analyzer=\"word\",\n",
    "           tokenizer=None,\n",
    "           preprocessor=None,\n",
    "           stop_words=None,\n",
    "           max_features=5000)\n",
    "\n",
    "# Vectorize the data\n",
    "print(\"Creating the tfidf vector...\\n\")\n",
    "# Learn the vocabulary/IDF weights on the training text and transform it in one pass\n",
    "x_train = tfidf.fit_transform(train_df['text'])\n",
    "# NOTE(review): the dense copy holds rows x 5000 floats; kept so that the\n",
    "# variables used by later cells stay plain ndarrays.\n",
    "x_train = x_train.toarray()\n",
    "\n",
    "# Project the test text onto the vocabulary learned from the training text\n",
    "x_test = tfidf.transform(test_df['text'])\n",
    "x_test = x_test.toarray()\n",
    "\n",
    "print(x_train.shape)\n",
    "print(x_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Target labels, taken from the same frame the features were built from so rows stay aligned\n",
    "y_train = train_df[\"fraudulent\"]\n",
    "# Disabled hold-out split, kept for reference. NOTE(review): as written it would bind the\n",
    "# training labels to `fraudulent` and the validation labels to `y_cal`, names nothing else uses.\n",
    "# x_train, x_val, fraudulent, y_cal = train_test_split(x_train, y_train, test_size=0.2, random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Counter({0: 16914, 1: 766})\n"
     ]
    }
   ],
   "source": [
    "# Count how many training rows carry each label value\n",
    "print(Counter(y_train))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
       "                   intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
       "                   multi_class='auto', n_jobs=None, penalty='l2',\n",
       "                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,\n",
       "                   warm_start=False)"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fit a logistic-regression model (liblinear solver) on the training features.\n",
    "# fit() returns the estimator itself, so the trailing expression displays it.\n",
    "model = LR(solver='liblinear').fit(x_train, y_train)\n",
    "model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10折交叉验证：\n",
      "0.9743212669683258\n"
     ]
    }
   ],
   "source": [
    "# Mean accuracy over 10 cross-validation folds of the training data.\n",
    "# NOTE(review): the label counts printed earlier are heavily skewed towards class 0,\n",
    "# so accuracy alone says little about the minority class - consider F1/recall as well.\n",
    "print(\"10折交叉验证：\")\n",
    "print(cross_val_score(model, x_train, y_train, cv=10, scoring=\"accuracy\").mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>pred</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  pred\n",
       "0   1     0\n",
       "1   2     1\n",
       "2   3     0\n",
       "3   4     0\n",
       "4   5     0"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Predict a label for every test row\n",
    "preds = model.predict(x_test)\n",
    "# Ids are 1-based positions in prediction order\n",
    "submission = pd.DataFrame({\n",
    "    'id': range(1, len(preds) + 1),\n",
    "    'pred': preds,\n",
    "})\n",
    "# Write the file without an index column or a header row\n",
    "submission.to_csv(\"../data/ml_submission.csv\", index=False, header=False)\n",
    "submission.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
